
### Project: IADB Government Payroll Analytics - Country
### Project leader: Dr Christian Schuster
### Code author (s): Robert Lipiński
### Date last update: run below line
file.info(rstudioapi::getActiveDocumentContext()$path)$mtime

### Script purpose: prepares the global environment for all other scripts in the project
### Execution time: <15 seconds (if all packages installed);  ~10-30 minutes (if all packages need to be installed)

### Inputs: none

### Outputs: 
# 1) /data/clean/execution_times.csv




#
# SET-UP -----------------------------------------------------------------------------------------------
#

t0 = Sys.time() # set-up start time



### Language  -----------------------------------------------------------------------------------------------
Sys.setlocale('LC_ALL', 'Spanish_Spain.1252')   # Spanish

# rm(list=ls())

### make a copy of the file    -----------------------------------------------------------------------------------------------
file.copy(rstudioapi::getSourceEditorContext()$path,
          gsub('code', 'code/00_archive', gsub('\\.R', ' - copy.R', rstudioapi::getSourceEditorContext()$path)),
          overwrite = T, copy.date = T)



### set path   -----------------------------------------------------------------------------------------------
# System <- Sys.getenv(x = NULL, unset = "")

if(Sys.info()[['user']] == 'idhsrol'){
  main_dir = dirname(dirname(rstudioapi::getActiveDocumentContext()$path)) # 'S:/Payroll_Analytics_in_Latin_America/Country'
}else if(Sys.info()[['user']] == 'robli'){
  main_dir = dirname(dirname(rstudioapi::getActiveDocumentContext()$path))  # '~/RA (Christian Schuster)/IADB_Country'
}
### NEW USERS CAN ADD THEIR IDENTIFIERS AND PATHS HERE

setwd(main_dir) # set working directory to the main folder



#
# PACKAGES -----------------------------------------------------------------------------------------------
#

### Load all required packages (pacman installs the ones still not present)


### UCL Data Save Haven Advisory has instructions for installing R packages in the DHS. The below
### token needs to be generated individually for each user. 
### The name of package to be installed needs to be specified as the 1st argument


# if(Sys.info()[['user']] == 'idhsrol'){
#   install.packages("rio", 
#                    repo="https://artifactory.idhs.ucl.ac.uk/artifactory/cran", 
#                    headers=c(Authorization="Bearer cmVmdGtuOjAxOjE3ODI3Mzc2MDc6c3VFeTRqVzgyM0JzRVBvRXFNZHFmbjJqSWJL"))
# }else if(Sys.info()[['user']] == 'XXXX'){
#   # install.packages("PACKAGE_NAME", 
#   #                  repo="https://artifactory.idhs.ucl.ac.uk/artifactory/cran", 
#   #                  headers=c(Authorization="Bearer cmVmdGtuOjAxOjE3ODI3Mzc2MDc6c3VFeTRqVzgyM0JzRVBvRXFNZHFmbjJqSWJL"))
# }


if( !is.element("pacman", installed.packages() )){
  install.packages("pacman", dep= T)
}


# devtools::install_github("rstudio/addinexamples", type = "source")

  
pacman::p_load(tidyverse, haven, stringr,
               janitor, data.table, ggplot2, stringi, dplyr,
               # pacforeign, 
               labelled, fastDummies, car, arrow, viridis,
               lubridate, scales, purrr, openxlsx, 
               stargazer,  ggpubr, paletteer, grid,
               gridExtra, patchwork,cowplot,
               tidylog, hunspell, future, ids,
               knitr, sf, readxl, broom,
               ggtext, psych, tm, ggridges, 
               tableone, zoo, rlang, stringfish, microbenchmark,
               gender, genero, tm, stringdist, fuzzyjoin, beepr,
               collapse, pryr, tools, scales, qs,
               fixest, openxlsx2, tidyxl, unpivotr, writexl,
               hunspell, skimr, gender, genero,
               # genderdata, 
               microbenchmark, DescTools,
               diffr,
               update = F)



#
# SETTINGS ------------------------------------------------------------------------------------------
#

### disambiguate functions
### make sure common functions are taken from the right packages
select <- dplyr::select
filter <- tidylog::filter # disables tidylog
shift  <- data.table::shift
# clean_names <- janitor::clean_names()


### number of columns shown in Viewer()
rstudioapi::writeRStudioPreference("data_viewer_max_columns", 1000L)



# set number of threads used by DT to maximum possible (not sure this does something, but shouldn't hurt)
# also should speed up .qs by ~40%
setDTthreads(percent = 100)
nthreads1 = getDTthreads()
nthreads1 = floor(0.9*nthreads1) # use only 90% to avoid completely slowing down computer?




#
# KEY GLOBAL PARAMETERS  --------------------------------------------------------------------------------------------------
#

start_year = 2019
end_year   = 2024 # year(ymd(today()))

# need to define first and last months of observation because for those months we cannot
# tell whether someone is newly hired or leaves respectively
start_date1 = ymd(paste(start_year, '01', '01', sep = '-'))
end_date1   = ymd(paste(end_year, '12', '31', sep = '-'))

# minimum size of a group to include indicator value in the final dashboard
# (e.g. if we want average pay values for women in org A in region X in 2024 and there are less than 'min_size'
# with such set of characteristics, as defined by sum of FTEs,  then we suppress/omit value for this group)
min_size = 10

# file format - define the default file format to be used for storing the data 
# of course individual files can be saved/read in different format, this is the default used
# whenver 'read_flex()' and 'write_flex()' custom functions are used (see their definition below)
format1 = 'parquet' # csv / xlsx / parquet / rds / qs (can be expanded in functions' definitions as needed)


# turn off scientific notation
options(scipen = 999)

sound_error = T # should each error make a distinct sound?
if(sound_error){
  options(error = function() beep=7)
}

# country name
country1 = 'country'

#
# DECIDE WHETHER TO RE-RUN ---------------------------------------------------------------------------------------
#

anew_bind_raw = T # should the script that binds raw files (in .parquet format) be run anew?



#
# CUSTOM FUNCTIONS ------------------------------------------------------------------------------------------
#

### Create new functions that will be handy to use throughout the scripts

## basic ----------------------------------------------------------------------------------------------------------------
mean_miss <- function(x){fmean(x, na.rm = TRUE)} # quick mean, without NAs (same in the below)
median_miss <- function(x){fmedian(x, na.rm = TRUE)}
sum_miss  <- function(x){fsum(x, na.rm = TRUE)}
sd_miss   <- function(x){fsd(x, na.rm = TRUE)}
min_miss  <- function(x){fmin(x, na.rm = TRUE)}
max_miss  <- function(x){fmax(x, na.rm = TRUE)}

fdistinct <- function(x){return(length(funique(x)))}

# based on the basic  ----------------------------------------------------------------------------------------------------------------
lower_ci  <- function(x){mean_miss(x) - 1.96 * (sd_miss(x) / sqrt(length(x)))}
upper_ci  <- function(x){mean_miss(x) + 1.96 * (sd_miss(x) / sqrt(length(x)))}
mode_miss <- function(x) {
  ux <- unique(na.omit(x))
  ux[which.max(tabulate(match(x, ux)))]
}

## custom tabulations   ----------------------------------------------------------------------------------------------------------------
sf        <- function(x){return(summary(factor(x)))} # print tabular summary of a vector
pr        <- function(x){return(prop.table(table(x, useNA = "no"))*100)} # proportions table (in %)
pr_na     <- function(x){return(prop.table(table(x, useNA = "ifany"))*100)} # proportions with NAs (in %)
pr_isna   <- function(x){return(pr(is.na(x)))} # print proportions of missing vs non-missing obs (in %)

# return top N values with % in descending order
pr_top = function(dta, var, n1 = 10){
  return(
    base::print(
    dta %>%
      ungroup %>% 
      select(-any_of(c('n'))) %>% 
      count(!!sym(var), sort = TRUE) %>%  # count each category and sort descending
      mutate(percentage = 100 * n / sum(n)) %>%  # calculate percentage
      slice_head(n=n1))
  )
}


table_na  <- function(x, y){return(format(table(x, useNA = "ifany")), big.mark = ',')} # table, pretty format, with NAs
table_s   <- function(x, s = F){return(sort(table(x, useNA = 'ifany'), decreasing = s))} # sorted table with NAs

# table_s   <- function(x, s = F){return(prettyNum(sort(table(x, useNA = 'ifany'), decreasing = s), big.mark = ','))} # with big.mark -> gives characters rather than numbers
pr_s   <- function(x, s = F){return(prop.table(sort(table(x, useNA = 'ifany'), decreasing = s)))} # sorted proportional table (in %)

## longer helpers  ----------------------------------------------------------------------------------------------------------------

source_global = function(x){
  source(list.files(
    path = dirname(rstudioapi::getActiveDocumentContext()$path),
    pattern = "global.*\\.R$",
    full.names = TRUE
  ))
}



# functions to flexibly adjust format of file being saved / read
# NOTE: first .parquet files were used, then .rds, then .qs -> all have some drawbacks and advantages,
# so this should limit the amount of changes in the code that are needed to adjust, as one function can be used
# to save/write and the right format can be specified in the global file


read_flex = function(file, format  = format1, sheet = 1, col_select = NULL){
  
  if(file_ext(file) != ''){
    file_full = file  # use file extension if one present in file name
    format = file_ext(file)
  }else{
    file_full = paste0(file, '.', format) # else add format defined in the function 
  }
  
  
  if(format == 'csv'){dta = fread(file_full, encoding = 'UTF-8')}
  if(format == 'xlsx'){dta = openxlsx2::read_xlsx(file_full, sheet)}
  if(format == 'parquet' & is.null(col_select)){dta = arrow::read_parquet(file_full)}
  if(format == 'parquet' & !is.null(col_select)){dta = arrow::read_parquet(file_full, col_select = col_select)}
  if(format == 'rds'){dta = readRDS(file_full) }
  if(format == 'qs'){dta = qs::qread(file_full, nthreads = nthreads1) }
  
  return(dta)
}

write_flex = function(x, file, format = format1){
  
  if(file_ext(file) != ''){
    file_full = file  # use file extension if one present in file name
    format = file_ext(file)
  }else{
    file_full = paste0(file, '.', format) # else add format defined in the function 
  }
  
  if(format == 'csv'){dta = fwrite(x, file_full, row.names = F, na = NA, encoding = 'UTF-8')}
  if(format == 'xlsx'){dta = openxlsx::write.xlsx(x, file_full)}
  if(format == 'parquet'){dta = arrow::write_parquet(x, file_full, compression = 'zstd')}
  if(format == 'rds'){dta = saveRDS(x, file_full) }
  if(format == 'qs'){dta = qs::qsave(x, file_full, nthreads = nthreads1) }
  
}


# instructions file (define function to read and clean it -> helps to update the object if any manual changes in the files are 
# made during testing below)



read_instructions = function(x){
  
  instructions <- openxlsx2::read_xlsx(file.path(main_dir, 'data', 'clean', 'country_instructions.xlsx'), sheet = 'instructions') %>%
    slice(-(1:2)) %>% 
    mutate(#fila = as.numeric(row_number()),
      across(everything(), ~trimws(gsub("\\s+", " ", .) )),  # ensure there are no white spaces ) %>% 
      fila = as.numeric(fila)) %>% # row number needs to be treated as numeric (mind, it needs to go after the above trimming code) 
    # rownames_to_column(var = 'fila') %>% relocate(row, .before = 1) %>%  # save original row number to more easily locate the rows in the Excel when making edits (now added in Excel directly)
    filter(grepl('[0-9]|promedio', anyo_fin)) %>%  # removes both NA's and non-Spanish headers (English translations and explanations for the users)
    filter(!grepl('xxx', variable_tipo))  # removes both NA's and indicators that are not possible to define in some countries and/or
  # with current data at hand, but are kept because (i) they can be calculated in other countries (ii) avoid confusion about 
  # the reason for discrepancy with indicators originally requested for the dashboard
  
  
  assign(paste0(country1, "_instructions"), instructions, envir = .GlobalEnv)
  
}

# run below to already load the instructions for the indicators as 'country_instructions
read_instructions() 


### Match with Christian's comments from 28/07/2025 (for Country)
# update1 =  openxlsx2::read_xlsx(file.path(main_dir, 'country_instructions - CS review.xlsx'), sheet = 'instructions') %>% 
#   slice(-(1:2)) %>% 
#   mutate(across(everything(), ~trimws(gsub("\\s+", " ", .) )),
#          fila = as.numeric(fila)) %>%
#   select(c(fila, matches('indicador'))) %>% 
#   rename(indicador_update = indicador) %>% 
#   filter(!is.na(fila) & !(indicador_eng == 'xxx'))
# 
# old1 = openxlsx2::read_xlsx(file.path(main_dir,  'data', 'clean', 'country_instructions.xlsx'), sheet = 'instructions') %>%
#   slice(-(1:2)) %>% 
#   mutate(across(everything(), ~trimws(gsub("\\s+", " ", .) )),
#          fila = as.numeric(fila)) %>%
#   select(c(fila, matches('indicador'))) %>% 
#   filter(!is.na(fila))
# 
# sf(update1$indicador_eng %in% old1$indicador_eng)
# update1$indicador_eng[!update1$indicador_eng %in% old1$indicador_eng]
# 
# temp = left_join(old1, distinct(update1 %>% select(-c(fila))), by = 'indicador_eng') %>% 
#   mutate(indicador_update = ifelse(indicador_update == indicador, '', indicador_update))
# 

### NOTE: save and the copy the new names column temporarily to the instructions file to compare old and new names (copy-paste)
### new names instead of old ones where necessary
# write_flex(temp, file.path(main_dir, 'data', 'intermediate_temp', 'country_instructions_temp.csv'))



# define a function to convert the dates

# temp1 = unique(country$date_terminate) #[!is.na(country$date_terminate)])
# summary(str_count(temp1))
# 
# i = '2022-03-11'
# for(i in as.character(temp1)){
#   print(i)
#   convert_to_date(i) %>% print
# }
# 
# sf(grepl('a-z', tolower(temp1)))


date_column = c( "2021-01-01", "2022-04-25", "2017-01-01", '20170201', NA, '2017-02-01 00:00')

convert_to_date <- function(date_column) {
  
  # try both ymd (first) and dmy
  date1 <- ymd(date_column, quiet = TRUE)
  
  # if ymd fails (produces NAs), try dmy where date1 is NA
  date1[is.na(date1)] <- dmy(date_column[is.na(date1)], quiet = TRUE)
  
  # if still NAs produces, try finding date-like string formats
  remaining_na <- is.na(date1)
  
  if (any(remaining_na)) {
    
    # search for substrings (dd[]mm[]yyyy)
    extracted_dates <- str_extract(date_column[remaining_na], "\\d{1,2}[/-]\\d{1,2}[/-]\\d{4}")
    date1[remaining_na & !is.na(extracted_dates)] <- dmy(extracted_dates)
    
    # search for substrings (yyyy[]mm[]dd)
    extracted_dates <- str_extract(date_column[remaining_na], "\\d{4}[/-]\\d{1,2}[/-]\\d{1,2}")
    date1[remaining_na & is.na(date1)] <- ymd(extracted_dates)
  }
  
  return(as.Date(date1))
  
}


str_wrap_br = function(x, n){ # wrap string with <br> instead of \n (mainly for ggtext labels in ggplot)
  x = gsub('\n', '<br>', str_wrap(x, n), fixed=T)
  return(x)
}


# function to clean unique values
clean_text <- function(vec) {
  vec <- tolower(str_trim(vec))  # Lowercase + trim whitespace
  vec <- fifelse(str_detect(vec, "^[[:space:][:punct:]]*$"), NA_character_, vec)  # Empty -> NA
  vec <- fifelse(vec == "", NA_character_, vec)  # Empty -> NA
  vec <- stri_trans_general(vec, "Latin-ASCII")  # Remove Spanish diacritics
  vec <- str_replace_all(vec, '\\s+', ' ')  # Replace multiple spaces with single space
  vec <- str_replace_all(vec, '[^ -~]+', '')  # Remove hard spaces and special characters
  return(vec)
}





# dummy lines to use both read/write functions above
# country = read_flex(file = file.path('data', 'intermediate', 'xxxxx'), format = format1)
# write_flex(x = country, file = file.path('data', 'intermediate', 'xxxxxx'), format = format1)

#### quicker combining of data frames using data.table syntax, but simiplified to work as left_join()
### (on 20 million rows data.frames with several columns each, left_join takes ~3x longer)
left_join_df = function(df1, df2){
  if(!any(grepl('data.table', class(df1)))){setDT(df1)}
  if(!any(grepl('data.table', class(df2)))){setDT(df2)}
  
  match_cols = intersect(names(df1), names(df2))
  
  df_merged = df2[df1, on = match_cols]
  
  return(df_merged)
}

deselect = function(dta, cols){
  dta[, (intersect(names(dta), cols)) := NULL ]
}


# to title case, but omitting stopwords (Spanish)
title_case_spanish <- function(text_vec) {
  # List of connectors not to capitalize unless first word
  connectors <- c("y", "o", "la", "las", "el", "los", "un", "una", "unos", "unas",
                  "de", "del", "al", "en", "con", "por", "para", "a", "e", "u", "ni", "que", "se")
  
  # Apply logic to each string in the vector
  sapply(text_vec, function(text) {
    words <- strsplit(tolower(text), " ")[[1]]
    capitalized <- sapply(seq_along(words), function(i) {
      word <- words[i]
      if (i == 1 || !(word %in% connectors)) {
        paste0(toupper(substr(word, 1, 1)), substr(word, 2, nchar(word)))
      } else {
        word
      }
    })
    paste(capitalized, collapse = " ")
  }, USE.NAMES = FALSE)
}


### extract min and max year-month dates from a file in mmm_yy format (to name files in the folders more cleanly)

# create 3-letter codes for Spanish names
spanish_months <<- c("ene", "feb", "mar", "abr", "may", "jun", 
                     "jul", "ago", "sep", "oct", "nov", "dic")


dates_my_esp = function(dta, anyo1 = 'anyo', mes1 = 'mes'){
  
  
  # check if pre-specified columns in the data
  if(!anyo %in% names(dta)){stop("Code execution stopped: specified 'anyo' variable not in the data")}
  if(!mes %in% names(dta)){stop("Code execution stopped: specified 'mes' variable not in the data")}
  
  # extract unique year-month combinations
  file_dates = dta %>% select(c(anyo1, mes1)) %>% distinct
  
  # ensure names are anyo and mes
  names(file_dates) = c('anyo', 'mes')
  
# Format: 3-letter month in Spanish + 2-digit year
format_ym <- function(m, y) {
    paste0(spanish_months[m], "", sprintf("%02d", y %% 100))
  }
  
  file_my_start <<- format_ym(min(file_dates$mes[file_dates$anyo == min(file_dates$anyo)]), min(file_dates$anyo))
  file_my_end   <<- format_ym(max(file_dates$mes[file_dates$anyo == max(file_dates$anyo)]), max(file_dates$anyo))
  
  # Combine into filename suffix
  file_name_suffix <<- paste0(file_my_start, "_", file_my_end)
  
  print(paste0('DONE: new start date saved as file_my_start (', file_my_start, '), new end date as file_my_end (', file_my_end,
               '), and file name suffix as file_name_suffix (', file_name_suffix, ')'))
}



extract_parentheses <- function(x) {
  str_extract_all(x, "\\(([^()]*)\\)")[[1]] |> 
    str_replace_all("[()]", "")  # remove the parentheses
}
extract_parentheses("Employee (HR) worked in (Finance) and (IT)")


name_check <- function(x, text1){names(x)[grepl(text1, names(x))]} # check if string in column names of df

see <- function(x, n){return(x[sample(1:nrow(x), n),])} # see random set of df rows

# random sample of rows from a column
row_sample = function(x, n  = 25){
  return(sample(x,n))
}

# flexible rounding
round_flex = function(x){
  if(abs(x) < 1){
    x = sprintf("%.3f",round(x, 3))  # to 3 decimal places if <1
  }else if(abs(x) >= 1  & abs(x) < 10){
    x = sprintf("%.2f",round(x, 2))  # to 2 decimal places if <10
  }else if(abs(x) >= 10 & abs(x) < 100){
    x = sprintf("%.1f",round(x, 1))  # to 1 decimal places if <100
  }else if(abs(x) >= 100){
    x= sprintf("%.0f",round(x, 0))  # to 0 decimal places ig >1000
  }
  
  return(x)
}


# Function to find observations above X st. dev. from the mean
outlier_sd = function(data, var, x){
  lower = mean_miss(data[,var]) - x*sd_miss(data[, var])
  upper = mean_miss(data[,var]) + x*sd_miss(data[, var])
  return(which(data[,var] < lower | 
                 data[,var] > upper))
}
#outlier_sd(dta, 'covid_vaccine', 3)


# Compare present in two data frames
diff_col = function(x, y){
  
  colx = colnames(x)
  coly = colnames(y)
  
  onlyx = setdiff(colx, coly)
  onlyy = setdiff(coly, colx)
  
  cat('COLUMNS ONLY IN X:\n', paste(onlyx, collapse = ', '), '\n')
  cat('COLUMNS ONLY IN Y:\n', paste(onlyy, collapse = ', '), '\n')
  
}

# Function to summarize all columns in the dataset
sf_dataset = function(data){
  for(var1 in names(data)){
    print(var1)
    data = data %>% mutate(value = !!rlang::ensym(var1))
    print(sf(data$value))
  }
}

# Add significance starts to numbers based on p-values (for tables to LaTeX / Overleaf)
sig_stars = function(var){
  
  var = ifelse(var == '<0.001', round(0.00, 2), round(as.numeric(var), 3))
  
  if(is.numeric(var)){
    var = ifelse(test = var < 0.10 & var >= 0.05, yes  = paste0(var, '$^{\\dotr}$'), no = var)
    var = ifelse(test = var < 0.05  & var >= 0.01, yes  = paste0(var, '$^{*}$'), no = var)
    var = ifelse(test = var < 0.01  & var >= 0.001, yes  = paste0(var, '$^{**}$'), no = var)
    var = ifelse(test = var < 0.001, yes  = paste0(var, '$^{***}$'), no = var)  
  }
  
  return(var)
}



# Extract coefficients from the model into a (semi-)clean LaTeX row
extract_coeftest = function(m1, length1){
  
  ### Extract coefficients
  
  # If OLS
  if(class(m1) %in% c('lm')){
    temp = summary(m1)$coefficients
    temp = data.frame(beta = temp[,1], se = temp[,2], p_value = temp[,4])
  }
  if(class(m1) %in% c('coeftest')){
    temp = temp = data.frame(beta = m1[,1], se = m1[,2], p_value = m1[,4])
  } 
  if(class(m1) %in% c('iv_robust')){
    temp = data.frame(beta = m1$coefficients, se = m1$std.error, p_value = m1$p.value)
  }
  
  
  # length1 = 2
  # m1 = lm(var ~  class_code * ecm_include_patient, data = dta_reg)
  # m1 = coeftest(m1, cluster.vcov(m1, dta_reg$list_id, df_correction = T))
  # temp = data.frame(beta = m1[,1], se = m1[,2], p_value = m1[,4])
  
  temp = add_column(temp, .before = 1, 'var' = rownames(temp))
  temp = temp[(nrow(temp)-length1):nrow(temp),]
  
  ### Round numbers
  temp$beta = sapply(temp$beta, round_flex)
  temp$se   = str_trim(paste0('(', sapply(temp$se, round_flex), ')'))
  
  ### Add significance stars
  temp$beta = ifelse(test = temp$p_value < 0.10 & temp$p_value >= 0.05, yes  = paste0(temp$beta, '$^{\\dotr}$'), no = temp$beta)
  temp$beta = ifelse(test = temp$p_value < 0.05  & temp$p_value >= 0.01, yes  = paste0(temp$beta, '$^{*}$'), no = temp$beta)
  temp$beta = ifelse(test = temp$p_value < 0.01  & temp$p_value >= 0.001, yes  = paste0(temp$beta, '$^{**}$'), no = temp$beta)
  temp$beta = ifelse(test = temp$p_value < 0.001, yes  = paste0(temp$beta, '$^{***}$'), no = temp$beta)
  
  
  ### To long
  temp$var_clean = temp$var
  temp = gather(temp, key, beta, -c(var, var_clean, p_value, se))
  temp = temp[order(temp$var), ]
  
  ### Clean variable (row) names
  #temp = temp %>% mutate(across(c(var_clean), ~ paste0('\\multirow{2}{*}{', .x, '}')))
  #temp$var_clean[seq(2,nrow(temp), 2)] = ''
  
  
  ###  Put minuses in $ signs, as otherwise they won't print correctly in LaTeX
  temp$beta = gsub('-', '$-$', temp$beta, fixed=T)
  
  ###  Put all columns in one dataframe column with LaTeX table separators
  # temp$cell1 = paste0(apply(temp[,c('var_clean', 'beta')], 1, paste, collapse = "&"), '\\')
  
  ### Final selection of columns
  temp = temp %>% dplyr::select(c(var, beta))
  
  ###  Return
  return(temp)
}




#
# PLOT THEMES --------------------------------------------------------------------------------------------------
#

### Country flag
# yellow: #ffd100
# blue: #0072ce
# red: #ef3340

update_geom_defaults("point", list(colour = "#009da7"))
update_geom_defaults("bar", list(fill = "#009da7", colour = "black"))

### set manually
palette1 = c(
  '#006276', #(dark turquoise)
  '#009da7', # (light turquoise)
  '#db551e', #(orange)
  '#8a171a', #(darkorange)
  '#fdb414', #(gold)
  '#b2902f', #(tan)
  '#B6F6FA' # (lighter turquoise)
)


### set manually
palette1_expand = c(
    '#8a171a', 
    '#C4474A', 
    '#db551e', 
    '#E68760', 
    '#b2902f',
    '#fdb414', 
    '#F0C743', 
    '#F0DB7F', 
    '#A2E0D5', 
    '#00828e',
    '#006276', 
    '#003b49',
    '#021B21'  
  )
  


### Handbook palette (darkblue-orange)  ----------------------------------------------------------------------------------------------------------------

palette2 = c('#fcb315', #(orange/gold)
             '#052c5d', # (darkblue)
             '#db551e', # (darkorange/)
             '#7ec1e4', # (lightblue)
             '#ed135f', # (pink)
             '#647aad', # (blue-gray)
             '#E2BDB0', # ('pale dogwood' (light peach))
             '#92B3DB' # (light blue-gray)
)

### Define default ggplot theme

theme_set(  theme_bw() +
              theme(
                plot.title = element_markdown(color = 'black', size = 33, hjust = 0.5),
                plot.subtitle = element_markdown(color = 'grey15', size = 32, hjust = 0.5),
                plot.caption= element_textbox_simple(face = 'plain', color = 'grey30', size = 19, hjust = 0),
                
                
                axis.text.x = element_markdown(size = 20),
                axis.text.y = element_markdown(size = 20),
                axis.title.x = element_markdown(size = 20),
                axis.title.y = element_markdown(size = 20),
                
                axis.line = element_line(),
                axis.ticks = element_line(),
                
                legend.direction = 'horizontal',
                legend.position = 'bottom',
                legend.title = element_text(size = 34, hjust = 0.5),
                legend.text = element_text(size = 30, hjust = 0),
                
                legend.key.spacing.x = unit(3, 'cm'),
                
                plot.background = element_rect(fill = 'white', color = 'NA'),
                panel.background = element_rect(fill = 'white', color = 'NA'),
                panel.border = element_blank(),
                panel.grid.major = element_blank(), #remove major gridlines
                panel.grid.minor = element_blank(), #remove minor gridlines
                
                strip.background = element_rect(color = NA, fill = NA),
                strip.text = element_markdown(size = 20),
              )
            
)


theme_small = theme_bw() +
  theme(
    plot.title = element_markdown(color = 'black', size = 13, hjust = 0.5),
    plot.subtitle = element_markdown(color = 'grey15', size = 12, hjust = 0.5),
    plot.caption= element_textbox_simple(face = 'plain', color = 'grey30', size = 9, hjust = 0),
    

    axis.text.x = element_markdown(size = 10),
    axis.text.y = element_markdown(size = 10),
    axis.title.x = element_markdown(size = 10),
    axis.title.y = element_markdown(size = 10),
    
    axis.line = element_line(),
    axis.ticks = element_line(),
    
    legend.direction = 'horizontal',
    legend.position = 'bottom',
    legend.title = element_text(size = 14, hjust = 0.5),
    legend.text = element_text(size = 10, hjust = 0),
    
    legend.key.spacing.x = unit(1.5, 'cm'),
    
    plot.background = element_rect(fill = 'white', color = 'NA'),
    panel.background = element_rect(fill = 'white', color = 'NA'),
    panel.border = element_blank(),
    panel.grid.major = element_blank(), #remove major gridlines
    panel.grid.minor = element_blank(), #remove minor gridlines
    
    strip.background = element_rect(color = NA, fill = NA),
    strip.text = element_markdown(size = 10),
  )



#
# EXTRACT R FILES --------------------------------------------------------------------------------------------------------
#
### easily extract R files from all folders (including non-Country IADB countries)
### for creating safety copies

anew_extract_r = F

if(anew_extract_r){
  
  # Get all .R files excluding those in paths containing 'archive'
  r_files <- list.files(
    path = dirname(main_dir), 
    pattern = "\\.R$", 
    recursive = TRUE, 
    full.names = TRUE
  )
  
  # Filter out files in folders with 'archive' or '- copy' in their path
  r_files <- r_files[!grepl("archive", r_files, ignore.case = TRUE)]
  r_files <- r_files[!grepl("- copy", r_files, ignore.case = TRUE)]
  
  
  # Create a temporary folder to copy the files
  temp_folder <- file.path(tempdir(), "r_files_zip")
  dir.create(temp_folder, showWarnings = FALSE)
  
  # Copy all selected .R files into the temp folder
  file.copy(r_files, temp_folder, overwrite = TRUE)
  
  # Set zip file path
  zip_path <- file.path(dirname(main_dir),  paste0("code_iadb_", format(Sys.time(), "%d_%m_%Y"), '.zip'))
  
  # Create the zip archive
  zip(zipfile = zip_path, files = list.files(temp_folder, full.names = TRUE), flags = "-j")  # -j to omit directory paths
  
  cat("Zipped", length(r_files), "R files to", zip_path, "\n")
  
  
}


# EXECUTION TIMES DATA.FRAME-----------------------------------------------------------------------------------------------------------------------------------------------------------

## NOTE: screate dataframe recording script execution times - each script will assign new row with its own execution time

# cols to be saved - script name; execution time in seconds;
# execution time on 5 minute segments (to give more approximate overview for the README);
# date of last execution



exec_time_fun = function(name1){
  
  t1 = Sys.time() # record end time
  
  if(exists('t0')){time_diff = as.numeric(difftime(t1, t0, units = "secs"))}
  if(!exists('t0')){time_diff = 'Start time not defined'}


  exec_time = read_flex(file.path('data', 'clean', name1), format='csv')
  
  exec_time = rbind(exec_time,
                    data.frame(script = basename(rstudioapi::getActiveDocumentContext()$path), 
                               time = ifelse(exists('t0'), round(as.numeric(time_diff), 0), time_diff), 
                               time_5 = ifelse(exists('t0'), round(as.numeric(time_diff)/(5*60), 0), time_diff),
                               last_run = Sys.time())) 
  
  exec_time$last_run = as.POSIXct(exec_time$last_run, tz = Sys.timezone())
  
  exec_time = exec_time %>% group_by(script) %>% filter(last_run == max(last_run)) %>%  
    ungroup() %>% filter(grepl('country', script)) %>%  arrange(script)
  
  print(exec_time)  
  write_flex(exec_time, file.path('data', 'clean', name1), format='csv')
}


if(!file.exists(file.path('data', 'intermediate', 'exec_time_raw.csv'))){ # create one if run for the first ime
  
  exec_time = data.frame(script = basename(rstudioapi::getActiveDocumentContext()$path), 
                         time = round(as.numeric(time_diff), 0), 
                         time_5 = round(as.numeric(time_diff)/60, 0),
                         last_run = Sys.time()) 
  
  # needs to be done to record local time, instead of it defaulting to UTC+0
  exec_time$last_run = as.POSIXct(exec_time$last_run, tz = Sys.timezone())
  
  write_flex(exec_time, file.path('data', 'intermediate', 'exec_time_raw'), format='csv')
  
}else{ # otherwise add and select the entry with most recen run time
  exec_time_fun('exec_time')
}



### list number of lines of code in each of the key scripts
all_r = list.files(path = 'code', pattern = '\\.R$', full.names=T)
all_r = all_r[!grepl('\\/[9XZ]', all_r, fixed=F)]

all_r_count = sapply(all_r, function(f) length(readLines(f, warn = FALSE))) %>% print
sum(all_r_count)


### convert .parquet to .rds if not present yet -------------------------------------------------------------------------------------------------------
# path1 = file.path(main_dir, 'data', 'intermediate')
# 
# files_parquet1 = list.files(path1, pattern = '\\.parquet', full.names = F)
# files_rds1 = list.files(path1, pattern = '\\.rds', full.names = F)
# 
# require(tools)
# missing_rds = files_parquet1[!file_path_sans_ext(basename(files_parquet1)) %in% file_path_sans_ext(basename(files_rds1))]
# 
# for(p in missing_rds){
#   print(p)
#   temp = read_parquet(file.path(path1, p))
#   saveRDS(temp, file.path(path1, gsub('\\.parquet', '\\.rds', p)))
# }


### convert .csv to .qs if not present yet -------------------------------------------------------------------------------------------------------
# path1 = file.path(main_dir, 'data', 'raw_csv')
# 
# files_csv1 = list.files(path1, pattern = '\\.csv', full.names = F)
# files_qs1 = list.files(path1, pattern = '\\.qs', full.names = F)
# 
# require(tools)
# missing_qs = files_rds1[!file_path_sans_ext(basename(files_rds1)) %in% file_path_sans_ext(basename(files_qs1))]
# 
# for(p in missing_qs){
#   print(p)
#   temp = readRDS(file.path(path1, p))
#   qs::qsave(temp, file.path(path1, gsub('\\.rds', '\\.qs', p)))
# }




#
# FIN DEL CÓDIGO --------------------------------------------------------------------------------------------------------
#